library(data.table)
library(ggplot2)
library(Hmisc)
library(lfe)
library(stringr)
library(readstata13)
library(zoo)


setDataParameters <- function() {
  
  pars <- list()
  pars$fico.break = 740
  pars$ltv.target = .70
  pars$BUNCHING_DELTA = 0.005
  pars$PURCHASE_CHANCE = .10
  
  ## Regulatory parameters
  # From https://www.mercatus.org/system/files/mercatus-barth-primer-capital-standards-v1.pdf
  pars$CR = data.table(year = c(2010,2011,2012,2013,2014,2015,2016,2017),rho_hat = c(.04,.04,.04,.045,.055,.06,0.06,0.06),
                       xi_g = c(.2,.2,.2,.2,.2,.2,.2,.2),xi_j = c(.5,.5,.5,.5,.5,.5,.5,.5)) 
  
  
  return(pars)  
  
}


createData <- function() {
### MAIN FUNCTION ###  
  
  
  # Raw-to-ready
  prepHMDAData()
  prepFannieData()
  prepFreddieData()
  prepBlackKnightData()
  
  # Create aggregated data
  aggregateGSEData()
  aggregateBlackKnightDataForBunchingAndIncome()
  aggregateBlackKnightData()
  aggregateHMDADataForShares()
  construct_rate_instrument(from_raw = T)
  prepCensusData()
  
  
  # Combine it all
  combineData()
}

combineData <- function() {
  
  # GSE Rates
  rates.gse <- fread('../data/working/gse_aggregated.csv')[,!c('AVG_LTV')]
  rates.blk <- fread('../data/working/black_knight_aggregated.csv')
  rates <- rbind(rates.gse,rates.blk)[,!c('N')]
  
  # Shares
  shares <- fread('../data/working/hmda_aggregated_shares.csv')
  
  # Market data from Blackknight
  blk.mkt <- fread('../data/working/black_knight_aggregated_market_variables.csv')
  mkt.sizes <- fread('../data/working/market_sizes.csv')
  mkt <- merge(blk.mkt,mkt.sizes,by=c('CBSA','YEAR','PURPOSE','FICO'))
  mkt[,MARKET_ID := 1:nrow(mkt)]
  
  # Performane data for blackknight
  blk.perf <- fread('../data/working/black_knight_perf_aggregated.csv')
  blk.perf <- blk.perf[complete.cases(blk.perf)]
  
  # Merge the data
  m1 <- merge(shares,rates,by=c('CBSA','YEAR','PURPOSE','FICO','JUMBO','LENDER'))
  m2 <- merge(mkt,m1,by=c('CBSA','YEAR','PURPOSE','FICO'))
  m3 <- merge(m2,blk.perf,by=c('CBSA','YEAR','PURPOSE','FICO'))
  m3 <- m3[!is.infinite(market.log_price.mean)]
  m3 <- m3[!is.infinite(market.log_income.mean)]
    
  # Calculate some market data means (in particular, income and demographics).
  m3[,market.log_income.mean.BAR := mean(market.log_income.mean,na.rm=T),by=c('YEAR')]
  m3[,market.log_price.mean.BAR := mean(market.log_price.mean,na.rm=T),by=c('YEAR')]
  
  # Create a number for each product
  m3[,ones := 1]
  m3 <- m3[order(CBSA,YEAR,PURPOSE,FICO,JUMBO,LENDER)]
  m3[,j := cumsum(ones),by=c('CBSA','YEAR','PURPOSE','FICO')]
  m3$ones <- NULL
  
  # Actual market shares...
  m3[,MARKET_SHARE_DATA := N_LOANS / MARKET_SIZE]
  
  # Drop markets where a rate is missing
  m3[,NON_NA_RATE := sum(RATE),by='MARKET_ID']
  m3 <- m3[!is.na(NON_NA_RATE)]
  m3$NON_NA_RATE <- NULL
  
  # Make sure all the markets have four rows...
  m3[,temp := .N,by=c('CBSA','YEAR','PURPOSE','FICO')]
  m3 <- m3[temp == 4]
  m3$temp <- NULL
  
  # New on OCT 29: N lenders = max(lender type over in market in all years)
  m3[,N_LENDERS := max(N_LENDERS),by=c('CBSA','PURPOSE','FICO','JUMBO','LENDER')]
  
  write.table(m3,'../data/final/estimation_data_oct_29.csv',row.names=F,col.names=T)
  
}


## BK---CBSA
prepBlackKnightData <- function() {
  bk.raw <- fread('../data/raw/Loan.txt')
  
  cols = c('LoanId','State','zip','MortgageTypeId','LoanTypeId','ProductTypeId','InterestTypeId',
           'ClosingMonth','FirstObservedInterestRate','OriginalInterestRate','OriginalTerm',
           'OriginalLoanAmount','OriginalPropertyValue','CLTV','SourceId','PropertyTypeId',
           'BalloonId','BalloonTerm','PurposeOfLoanId','OriginalCreditScore',
           'PrepaymentPenaltylauseId','PAndIFrequencyId','OccupancyId','DTIHousingRatio',
           'Recourse','DocumentationId','MICompanyId','UnderwritingId','NetServiceFee','GuaranteeFee',
           'OriginationMonth','FirstPayMonth','IsJumbo','FirstInterestResetMonth',
           'PIFMonth','PIFId','FirstObservedMonth','IsIO','UpdateDtTm','AgeAtBoarding','TeaserRate',
           'OriginalLTV','JumboAtOriginationFlag','CBSAMetroDivId')  
  
  names(bk.raw) <- cols
  
  # Get date of origination
  # Origination month is months since 12/1979
  # 1 = 1/1980.
  bk.raw[,origination.year := 1980 + floor(OriginationMonth / 12)]
  
  
  # Keep:
  ## Year in 2007:2017
  
  # Keep
  # Year in 2007:2017
  # Fixed rate non-IO, Conventional (Non-FHA, etc.), 
  ColsToKeep <- c('LoanId','zip','CBSAMetroDivId','origination.year','OriginalInterestRate','OriginalTerm','OriginalLoanAmount','OriginalPropertyValue','PropertyTypeId','PurposeOfLoanId','OriginalCreditScore','OccupancyId','DTIHousingRatio',
                  'DocumentationId','IsJumbo','OriginalLTV','JumboAtOriginationFlag')
  toKeep <- bk.raw[!is.na(zip) & OriginalCreditScore != 0 & origination.year %in% 2007:2017 & LoanTypeId == 1 & ProductTypeId %in% c(3,6) & InterestTypeId == 1 & BalloonId == 0 & MortgageTypeId == 1 & SourceId != 3,ColsToKeep,with=F]
  
  toKeep[,zip3 := substr(zip,1,3)]
  
  # Load on the conforming loan limits
  conf.limits <- fread('../data/raw/crosswalk.csv')
  conf.limits[,zip3 := substr(str_pad(zip,width=5,pad='0',side='left'),1,3)]
  conf.limits <- conf.limits[!is.na(conforming_limit)]
  conf.limits <- conf.limits[!duplicated(paste(zip3,year)),c('zip3','year','conforming_limit'),with=F]
  
  with.limits <- merge(toKeep,conf.limits,by.x=c('origination.year','zip3'),by.y=c('year','zip3'),all.x=T)
  with.limits[is.na(conforming_limit),conforming_limit := 417000]
  with.limits <- with.limits[,!c('zip'),with=F]
  
  write.table(x = with.limits,file = '../data/working/black_knight_to_analyze.csv',row.names=F,col.names=T)
}


aggregateBlackKnightData <- function() {
  data.in <- fread('../data/working/black_knight_to_analyze.csv')
  pars <- setDataParameters()
  
  # Keep only jumbo loans and purchases / refis & first mortgages
  toAgg <- data.in[JumboAtOriginationFlag == 1 & PropertyTypeId == 1 & PurposeOfLoanId %in% c(1,4,5,6) & OriginalTerm == 360]
  toAgg[,purpose := 'r']
  toAgg[PurposeOfLoanId == 1,purpose := 'p']
  toAgg[,fico := 'l']
  toAgg[OriginalCreditScore >= pars$fico.break,fico := 'h']
  
  # Need to project out LTV
  model <- lm(OriginalInterestRate ~ OriginalLTV,data=toAgg )
  coeff <- model$coefficients['OriginalLTV']
  toAgg[,LTV.Deviation := OriginalLTV - pars$ltv.target]
  toAgg[,rate.adjusted := OriginalInterestRate - coeff * LTV.Deviation ]
  
  
  aggregated <- toAgg[,j=list(lender = 'b',jumbo = 'j',rate = mean(rate.adjusted * 100,na.rm=T),avg.fico = mean(OriginalCreditScore,na.rm=T),N = .N),by=c('CBSAMetroDivId','fico','purpose','origination.year')]
  aggregated <- aggregated[,c('CBSAMetroDivId','origination.year','purpose','fico','lender','jumbo','rate','N','avg.fico'),with=F]
  names(aggregated) <- c('CBSA','YEAR','PURPOSE','FICO','LENDER','JUMBO','RATE','N','AVG_FICO')
  write.table(aggregated,'../data/working/black_knight_aggregated.csv',row.names=F,col.names=T)
}

## FF---ZIP --> CBSA
prepFannieData <- function() {
  all.files <- dir('../data/raw/FF_zips/fannie/')
  zip2cbsa <- fread('../data/raw/ZIP_CBSA_032010.csv')
  zip2cbsa <- zip2cbsa[order(ZIP,-TOT_RATIO)]
  zip2cbsa[,zip := substr(str_pad(ZIP,width=5,pad='0',side='left'),1,3)]
  zip2cbsa <- zip2cbsa[!duplicated(zip)]
  

  for(ff in 1:length(all.files)) {
    print(all.files[ff])
    f.in <- fread(paste0('../data/raw/FF_zips/fannie/',all.files[ff]))
    names(f.in) <- c('id','channel','seller','rate','upb','term','orig.date','first.pay.adate','ltv','cltv','n.borrowers',
                     'dti','fico','first.time','purpose','property.type','units','occupancy.type','state','zip','insurance',
                     'product.type','co.borrower.fico','insurance.type','relocation')
    f.in[,zip := str_pad(zip,width=3,pad='0',side='left')]
    with.cbsa <- merge(f.in,zip2cbsa[,c('zip','CBSA')],by='zip')

    # Keep only owner occupied single family
    with.cbsa <- with.cbsa[occupancy.type == 'P' & property.type == 'SF' & term == 360]
    
    if(ff == 1) {
      result <- with.cbsa
    } else {
      result <- rbind(result,with.cbsa)
    }
      
  }
  
  # Load classifications
  classification <- fread('../data/raw/fannie_classifications.csv',header = T)
  classification$V3 <- NULL
  
  # Get year
  result[,YEAR := year(as.Date(paste0('01/',orig.date),format='%d/%m/%Y'))]
  result <- result[YEAR %in% 2007:2017,c('seller','rate','orig.date','YEAR','ltv','cltv','fico','dti','purpose','CBSA'),with=F]
  result <- merge(result,classification,by='seller')
  
  write.table(result,'../data/working/fannie_to_analyze.csv',row.names=F,col.names=T)
}



prepFreddieData <- function() {
  freddie.files <- dir('../data/raw/FF_zips/freddie/',pattern = 'historical_data1_Q.*txt')
  
  for(ff in 1:length(freddie.files)) {
    print(freddie.files[ff])
    f.in <- fread(paste0('../data/raw/FF_zips/freddie/',freddie.files[ff]))
    names(f.in) <- c('fico','first.payment.date','first.time','mat.date','msa','insur','num.units','occupancy',
                     'cltv','dti','ubp','ltv','rate','channel','prepayment.penalty','product.type','state','property.type',
                     'zip','id','purpose','original.term','n.borrowers','seller','servicer','superconforming')
  
    
    # Keep only owner occupied single family
    f.in <- f.in[num.units == 1 & occupancy == 'P' & property.type == 'SF' & original.term == 360]
  
    if(ff == 1) {
      result <- f.in
    } else {
      result <- rbind(result,f.in)
    }
    
  }
  
  # Load classifications
  classification <- fread('../data/raw/freddie_classifications.csv')
  classification <- rbind(classification,data.table(seller = 'Other sellers',lender = 'o'))
  result[,orig.date := mat.date - 100 * 360/12]
  result[,YEAR := as.numeric(substr(orig.date,1,4))]
  
  # The "MSAs" in Freddie appear to be CBSAs
  result[,CBSA := msa]
  
  result <- result[YEAR %in% 2007:2017,c('seller','rate','orig.date','YEAR','ltv','cltv','fico','dti','purpose','CBSA'),with=F]
  result <- merge(result,classification,by='seller')
  
  write.table(result,'../data/working/freddie_to_analyze.csv',row.names=F,col.names=T)
  
}


aggregateGSEData <- function() {
  
  fannie.in  <- fread('../data/working/fannie_to_analyze.csv')
  freddie.in <- fread('../data/working/freddie_to_analyze.csv')
  pars <- setDataParameters()
  
  stacked <- rbind(fannie.in,freddie.in)
  
  stacked[,PURPOSE := 'r']
  stacked[purpose == 'P',PURPOSE := 'p']
  
  stacked[,FICO := 'l']
  stacked[fico >= pars$fico.break,FICO := 'h']
  
  # Remove bad ltvs
  stacked <- stacked[ltv != 999]
  
  
  # Project out ltv
  model <- lm(rate ~ ltv,data=stacked )
  coeff <- model$coefficients['ltv']
  stacked[,LTV.Deviation := ltv - 100*pars$ltv.target]
  stacked[,rate.adjusted := rate - coeff * LTV.Deviation ]
  
  # Aggregate it
  aggregated <- stacked[lender != 'o',j=list(jumbo = 'c',rate = mean(rate.adjusted,na.rm=T),N = .N,avg.fico = mean(fico,na.rm=T),avg.ltv = mean(ltv,na.rm=T)),by=c('CBSA','YEAR','FICO','PURPOSE','lender')]
  names(aggregated) <- c('CBSA','YEAR','FICO','PURPOSE','LENDER','JUMBO','RATE','N','AVG_FICO','AVG_LTV')
  aggregated <- aggregated[,c('CBSA','YEAR','PURPOSE','FICO','LENDER','JUMBO','RATE','N','AVG_FICO','AVG_LTV'),with=F]
  
  # Write it
  write.table(aggregated,'../data/working/gse_aggregated.csv',row.names=F,col.names=T)
}



prepHMDAData <- function() {
  
  hmda.in <- fread('../data/raw/loans07to17.csv')
  classification <- load_avery()[type %in% c('B','S')] # Remove non-banks / non-shadow banks (e.g., thrifts and credit unions)
  with.avery <- merge(hmda.in[,!c('rssdid')],classification,by.x=c('respondent_id','year'),by.y=c('hmprid','year'))
  
  # Remove the loans we don't want. Conventional; refi or purchase, single family, owner occupied, first lien
  correct.loans <- with.avery[loan_type == 1 & loan_purpose %in% c(1,3) & property_type == 1 & owner_occupancy == 1 & lien_status == 1]
  
  # Add the classification for shadow bank fintech
  sb.classification <- fread('../data/raw/shadow_banks_hmda_classification.csv')
  fintechs <- sb.classification[Classification == 'Fintech']$respondent_name
  correct.loans[, lender := 'b']
  correct.loans[type == 'S',lender := 'n']
  correct.loans[respondent_name %in% fintechs, lender := 'f']
  
  # HMDA also appears to be CBSA, not msa
  
  correct.columns <- correct.loans[,c('year','respondent_id','rssdid','lender','msa','conforming_limit','loan_purpose','loan_amount','income','purchaser_type'),with=F]
  write.table(correct.columns,'../data/working/hmda_to_analyze.csv',row.names=F,col.names=T,sep='|')
  
  
 
}


aggregateHMDADataForShares <- function() {
  
  # Load data
  data.in <- fread('../data/working/hmda_to_analyze.csv')
  data.in[,PURPOSE := 'r']
  data.in[loan_purpose == 1,PURPOSE := 'p']
  data.in[,LENDER := lender]
  data.in[,JUMBO := 'c']
  data.in[loan_amount / conforming_limit > 1.005,JUMBO := 'j']
  data.in[,CBSA := as.integer(msa)]
  data.in[,YEAR := year]
  
  # A trick is that we need the number of loans for high and low fico by lender type, but we don't observe FICO in HMDA.
  gse.aggregated <- fread('../data/working/gse_aggregated.csv')
  blk.aggregated <- fread('../data/working/black_knight_aggregated.csv')
  rate.data <- rbind(gse.aggregated,blk.aggregated)
  rate.data[,total.by.c.y.p.l.t := sum(N),by=c('CBSA','YEAR','PURPOSE','LENDER','JUMBO')]
  rate.data[,share.by.c.y.p.l.t := N / total.by.c.y.p.l.t]
  
  # Unique shadow bank non-fintechs per year, for supply estimation
  unique.nb.by.year <- data.in[LENDER == 'n',j=list(n = length(unique(respondent_id))),by='YEAR']
  
  # Gets number of loans and number of lenders for each type.
  aggregated <- data.in[!is.na(CBSA),j=list(N_LENDERS = length(unique(respondent_id)),N_LOANS_TOTAL = .N),by=c('CBSA','YEAR','PURPOSE','LENDER','JUMBO')]
  
  aggregated.h <- aggregated
  aggregated.l <- aggregated
  aggregated.h$FICO <- 'h'
  aggregated.l$FICO <- 'l'
  hmda.aggregated <- rbind(aggregated.h,aggregated.l)
  
  # A trick is that we need the number of loans for high and low fico by lender type, but we don't observe FICO in HMDA.
  # To solve, merge on rates, which has market shares of each FICO type by product/lender etc.
  merged <- merge(hmda.aggregated,rate.data,by=c('CBSA','YEAR','PURPOSE','LENDER','JUMBO','FICO'))
  merged[,N_LOANS := N_LOANS_TOTAL * share.by.c.y.p.l.t]
  
  # Only keep markets with non-degenerate market shares in everything, and only jumbo loans by banks.... (happens due to black knight)
  merged[,n.offerings := .N,by=c('CBSA','YEAR','PURPOSE','FICO')]
  merged <- merged[n.offerings == 4] # Need complete markets
  
  # Keep only the necessary columns
  toKeep <- merged[,c('CBSA','YEAR','PURPOSE','FICO','LENDER','JUMBO','N_LENDERS','N_LOANS'),with=F]
  
  write.table(toKeep,'../data/working/hmda_aggregated_shares.csv',row.names=F,col.names=T)
  
}


aggregateBlackKnightDataForBunchingAndIncome <- function() {
  
  
  data.in <- fread('../data/working/black_knight_to_analyze.csv')
  pars <- setDataParameters()
  
  # Keep only jumbo loans and purchases / refis & first mortgages
  toAgg <- data.in[PropertyTypeId == 1 & PurposeOfLoanId %in% c(1,4,5,6) & OriginalTerm == 360]
  toAgg[,purpose := 'r']
  toAgg[PurposeOfLoanId == 1,purpose := 'p']
  toAgg[,fico := 'l']
  toAgg[OriginalCreditScore >= pars$fico.break,fico := 'h']
  
  # Get right names
  toAgg[, CBSA := CBSAMetroDivId]
  toAgg[, YEAR := origination.year]
  toAgg[, PURPOSE := purpose]
  toAgg[, FICO := fico]
  
  # Implied income and house prices
  toAgg[,monthly.payment := OriginalLoanAmount * ( (OriginalInterestRate / 12) * (1 + OriginalInterestRate / 12)^OriginalTerm ) / ((1 + OriginalInterestRate / 12)^OriginalTerm - 1)]
  toAgg[,monthly.income  := monthly.payment / (DTIHousingRatio) * 100]
  toAgg[,annual.income   := monthly.income * 12]
  
  # Bunching
  toAgg[,cnf.pct := OriginalLoanAmount / conforming_limit]
  toAgg[,bunching      := as.integer(cnf.pct > 1-pars$BUNCHING_DELTA  & cnf.pct < 1+pars$BUNCHING_DELTA )]
  toAgg[,bunching.ring := as.integer( (cnf.pct > .9 & cnf.pct < 1-pars$BUNCHING_DELTA) | (cnf.pct > 1+pars$BUNCHING_DELTA & cnf.pct < 1.1))]
  toAgg[,just.above := as.integer( cnf.pct > 1.001 & cnf.pct <= 1.05)]
  toAgg[,just.below := as.integer( cnf.pct <= 1.001 & cnf.pct > 0.95)]
  
  # Throw out bad obs
  toAgg <- toAgg[annual.income < 1e6 & OriginalPropertyValue < 1e7 & OriginalLoanAmount < 1e7]
  
  # Aggregate it
  aggregated_market <- toAgg[!is.na(CBSA),j=list(market.log_income.mean = mean(log(annual.income),na.rm=T),
                              market.log_income.std  =   sd(log(annual.income),na.rm=T),
                              market.log_price.mean  = mean(log(OriginalPropertyValue),na.rm=T),
                              market.log_price.std   =   sd(log(OriginalPropertyValue),na.rm=T),
                              market.log_loan_size.mean = mean(log(OriginalLoanAmount),na.rm=T),
                              market.log_loan_size.std  =   sd(log(OriginalLoanAmount),na.rm=T),
                              market.bunching        = mean(bunching,na.rm=T),
                              market.bunching.income.delta = weighted.mean(annual.income,w=bunching,na.rm=T) - weighted.mean(annual.income,w=bunching.ring,na.rm=T),
                              market.just.above = mean(just.above,na.rm=T),
                              market.just.below = mean(just.below,na.rm=T),
                              market.conforming_loan_limit = conforming_limit[1])
                             ,by=c('CBSA','YEAR','PURPOSE','FICO')]
  
  # If there are no bunching loans...
  aggregated_market[is.na(market.bunching.income.delta),market.bunching.income.delta := 0]
  
  # Within market need correlation between income, house price
  # Will keep this relationship constant for everyone.
  correlation <- cor(x=log(toAgg[annual.income > 0 & OriginalPropertyValue > 0]$annual.income),y=log(toAgg[annual.income > 0 & OriginalPropertyValue > 0]$OriginalPropertyValue),use = 'complete.obs')
  aggregated_market[,market.income_price.corr := correlation]
  
  
  write.table(aggregated_market, '../data/working/black_knight_aggregated_market_variables.csv',row.names=F,col.names=T)
}




prepCensusData <- function() {
  d.in <- fread('../data/raw/nhgis0006_ds215_20155_2015_cbsa.csv')
  
  pars <- setDataParameters()
  
  # Market size for purchases is 1/10 the number of owner occupied houses (Regardless of mortgage status)
  # Market size for refis is the number of households with mortgages.
  
  d.in[,CBSA := CBSAA]
  d.in[,households := ADR0E001]
  d.in[,mortgages := ADR0E002]
  
  #  Remember, we need to allocate fraction in high and low fico...
  gse.aggregated <- fread('../data/working/gse_aggregated.csv')
  blk.aggregated <- fread('../data/working/black_knight_aggregated.csv')
  rate.data <- rbind(gse.aggregated,blk.aggregated)
  by_cbsa_year_purpose <- rate.data[,j=list(N = sum(N,na.rm=T)),by=c('CBSA','YEAR','PURPOSE','FICO')] 
  by_cbsa_year_purpose[,TOTAL := sum(N),by=c('CBSA','YEAR','PURPOSE')]
  by_cbsa_year_purpose[,frac := N / TOTAL]
  
  # Market sizes
  pur <- d.in[,c('CBSA','households')]
  names(pur) <- c('CBSA','MARKET_SIZE_RAW')
  pur[,MARKET_SIZE := MARKET_SIZE_RAW * pars$PURCHASE_CHANCE]
  pur$PURPOSE = 'p'
  pur1 <- pur
  pur2 <- pur
  pur1$FICO = 'h'
  pur2$FICO = 'l'
  
  ref <- d.in[,c('CBSA','mortgages')]
  names(ref) <- c('CBSA','MARKET_SIZE_RAW')
  ref[,MARKET_SIZE := MARKET_SIZE_RAW]
  ref$PURPOSE = 'r'
  ref1 <- ref
  ref2 <- ref
  ref1$FICO <- 'h'
  ref2$FICO <- 'l'
  
  # Combined
  combined <- rbind(pur1,pur2,ref1,ref2)
  
  # Merge on the high and low fico thing
  cc <- merge(combined,by_cbsa_year_purpose,by=c('CBSA','PURPOSE','FICO'))
  cc[,MARKET_SIZE_RAW := MARKET_SIZE_RAW * frac]
  
  ready.to.go <- cc[,c('CBSA','YEAR','PURPOSE','FICO','MARKET_SIZE')]
  
  write.table(ready.to.go,'../data/working/market_sizes.csv',row.names=F,col.names=T)
  
  
}

construct_rate_instrument <- function(from_raw = T) {
  
  if(from_raw) {
    all.loan.months <- dir('../../blackknight/loan_month/',pattern = 'txt')
    
    for(ff in 1:length(all.loan.months)) {
      cf <- all.loan.months[ff]
      print(cf)
      data.in <- fread(paste0('../../blackknight/loan_month/',cf))
      names(data.in) <- c('LoanId','AsOfMonth','LoanAge','InterestType','InterestRate',
                          'PriorInterestRate','UPB','NextPaymentDueMonth','ForeclosureId',
                          'BackruptcyFlag','BankruptcyChapterId','PIFId','InvestorId',
                          'PandIConstant','InLossMitigation','CurrentCreditScore',
                          'RemainingTerm','PaymentStatus','TAndIConstant','SPB',
                          'IsIOPayment','IsNegAmortPayment','IsPrepayment','PrepaymentAmount',
                          'NegAmortAmount','ChargeoffAmount','PriorPaymentStatus',
                          'HasPriorMonth','UpdateDtTm','CurrentJumboFlag','UPBtoSPB')
    
    
      # Get loans that are exactly 12 months old and fixed rate...
      year.old <- data.in[LoanAge == 12 & InterestType == 1]
      year.old[,ReportDate := as.Date(as.yearmon(as.Date('19791201',format='%Y%m%d')) + AsOfMonth/12)]
      perf     <- year.old[,c('LoanId','ReportDate','InterestRate','UPB','PaymentStatus')]
      perf[,delinquent := as.integer(PaymentStatus %in% c('R','B','F','2','3','4'))]
      
      # Write it
      write.table(perf,paste0('../data/working/black_knight_perf/',cf),row.names=F,col.names=T)
    }
    
    
    # Load and combine them all
    files <- dir('../data/working/black_knight_perf/')
    for(ff in 1:length(files)) {
      f.in <- fread(paste0('../data/working/black_knight_perf/',files[ff]))
      if(ff == 1) {
        all.perf <- f.in
      } else {
        all.perf <- rbind(all.perf,f.in)
      }
    }
    write.table(all.perf,'../data/working/blackknight_combined_perf.csv',row.names=F,col.names=T)
  
  }
  
  pars <- setDataParameters()
  
  # load BK data
  perf.in <- fread('../data/working/blackknight_combined_perf.csv')
  orig.in <- fread('../data/working/black_knight_to_analyze.csv')
  
  # Merged
  merged <- merge(orig.in,perf.in,by='LoanId')
  toAgg <- merged[JumboAtOriginationFlag == 1 & PropertyTypeId == 1 & PurposeOfLoanId %in% c(1,4,5,6) & OriginalTerm == 360]
  toAgg[,purpose := 'r']
  toAgg[PurposeOfLoanId == 1,purpose := 'p']
  toAgg[,fico := 'l']
  toAgg[OriginalCreditScore >= pars$fico.break,fico := 'h']
  
  # Get right names
  toAgg[, CBSA := CBSAMetroDivId]
  toAgg[, YEAR := origination.year]
  toAgg[, PURPOSE := purpose]
  toAgg[, FICO := fico]
  
  aggregated <- toAgg[,j=list(D_60_RATE = mean(delinquent,na.rm=T)),by=c('CBSA','PURPOSE','FICO','YEAR')]
  aggregated <- aggregated[order(CBSA,PURPOSE,FICO,YEAR)]
  aggregated[,D60.L1 := shift(D_60_RATE,type = 'lag',n = 1),by=c('CBSA','PURPOSE','FICO')]
  aggregated[,D60.L2 := shift(D_60_RATE,type = 'lag',n = 2),by=c('CBSA','PURPOSE','FICO')]
  
  write.table(aggregated,'../data/working/black_knight_perf_aggregated.csv',row.names=F,col.names=T)
  
}



load_avery <- function() {
  avery <- data.table(read.dta13('../data/raw/hmdpanel17.dta'))
  
  years = 7:17
  
  for(yy in 1:length(years)) {
    cy = years[yy]
    temp <- avery[,c('hmprid',paste0('ENTITY',str_pad(cy,pad = '0',side = 'left',width = 2)),paste0('TYPE',str_pad(cy,pad = '0',side = 'left',width = 2))),with=F]
    names(temp) <- c('hmprid','rssdid','TYPE')
    temp <- temp[complete.cases(temp[,c('hmprid','TYPE')])]
    temp[,year := cy + 2000]
    temp <- temp[!duplicated(hmprid)]
    temp[,rssdid := str_pad(rssdid,width = 10,side = 'left',pad = '0')]
    temp[,type := 'O']
    temp[TYPE %in% c(40,41) ,type := 'S']
    temp[TYPE %in% 10:14    ,type := 'B']
    if(yy == 1) {
      result <- temp[,c('year','hmprid','rssdid','type')]
    } else {
      result <- rbind(result,temp[,c('year','hmprid','rssdid','type')])
    }
  }
  
  
  return(result)
  
}


createActiveBanksByMarket <- function() {
  
  data.in <- fread('../data/working/hmda_to_analyze.csv')
  data.in[,PURPOSE := 'r']
  data.in[loan_purpose == 1,PURPOSE := 'p']
  data.in[,LENDER := lender]
  data.in[,JUMBO := 'c']
  data.in[loan_amount / conforming_limit > 1.005,JUMBO := 'j']
  data.in[,CBSA := as.integer(msa)]
  data.in[,YEAR := year]
  
  # Keep banks only
  data.in <- data.in[LENDER == 'b']
  
  # Aggregate by CBSA-YEAR-PURPOSE-JUMBO-RSSDID
  market.lender <- data.in[,j=list(N_ORIG = .N,PCT_HELD = mean(purchaser_type %in% c(0,8))),by=c('CBSA','YEAR','PURPOSE','JUMBO','rssdid')]
  
  # Get bank capitalization
  cap.in <- fread('../data/raw/balance_sheet.csv')[,c('rssdid','year','total_assets','cr')]
  names(cap.in) <- c('rssdid','YEAR','BANK_ASSETS','BANK_CR')
  cap.in[,BANK_CR := BANK_CR / 100]
  
  # Merge them
  merged <- merge(market.lender,cap.in,by=c('rssdid','YEAR'))
  
  # Drop some naughty things.
  merged <- merged[complete.cases(merged)]
  merged[JUMBO == 'j',PCT_HELD := 1] # Assume all jumbos are held
  
  # Add in capital requirement rules
  pars <- setDataParameters()
  merged <- merge(merged,pars$CR,by.x='YEAR',by.y='year')
  
  write.table(merged,'../data/final/capitalization_data_sep9.csv',row.names=F,col.names=T)
}
